TL;DR

The zinb model doesn’t seem to work well in the Fluidigm data. This is independent of normalization, meaning that both unnormalized data and normalized data (both with scran and full-quantile) lead to bad results.

PCA leads to better results, even though, as many people have shown, the first component is highly correlated to detection rate, even after normalization.

ZIFA seems to work well in the Fluidigm datasets, suggesting that we should be able to find a way to make zinb work as well.

One key difference between ZIFA and zinb is that in our model both \(\mu\) and \(\pi\) depend on \(W\), while in the ZIFA model only \(\mu\) does. We should consider a model that doesn’t have a \(W\) in \(\pi\).

It seems (preliminary analyses) that removing the intercept in V makes the weird W behavior go away. Still, I think that we should add the ability to remove W from \(\pi\) and see what is the difference.

High coverage data

Select high coverage cells, filter out the genes that do not have at least 10 counts in at least 10 cells.

data("fluidigm")
fluidigm_high <- fluidigm[,which(colData(fluidigm)$Coverage_Type=="High")]
filter <- apply(assay(fluidigm_high)>10, 1, sum)>=10

Number of retained genes:

print(sum(filter))
## [1] 6998

No normalization

Fit a zinb model on the high coverage data (no normalization).

fluidigm_high <- fluidigm_high[filter,]
high <- assay(fluidigm_high)
set.seed(23424)
print(system.time(zinb_high <- zinbFit(high, ncores = 3, K = 2)))
##    user  system elapsed 
## 683.473  84.752 297.057

Plot the results with cells colored according to their biological condition.

bio <- as.factor(colData(fluidigm_high)$Biological_Condition)
cl <- as.factor(colData(fluidigm_high)$Cluster2)

plot(zinb_high@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)

plot(zinb_high@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)

Compared to PCA (of the log counts)

pca <- prcomp(t(log1p(high)))
plot(pca$x, col=cols[bio], pch=19)
legend("topleft", levels(bio), fill=cols)

plot(pca$x, col=cols2[cl], pch=19)
legend("topleft", levels(cl), fill=cols2)

Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.

#first factor and total number of detected genes in the cell
cor(zinb_high@W[,1], colSums(high>0),method="spearman")
## [1] -0.2168706
#second factor and the total number of detected genes in the cell
cor(zinb_high@W[,2], colSums(high>0),method="spearman")
## [1] 0.5244318
#first factor and the total number of counts in the cell
cor(zinb_high@W[,1], colSums(high),method="spearman")
## [1] -0.1518794
#second factor and the total number of counts in the cell
cor(zinb_high@W[,2], colSums(high),method="spearman")
## [1] 0.3468531

Same for PCA

#first factor and total number of detected genes in the cell
cor(pca$x[,1], colSums(high>0), method="spearman")
## [1] 0.9232517
#second factor and the total number of detected genes in the cell
cor(pca$x[,2], colSums(high>0), method="spearman")
## [1] 0.297028
#first factor and the total number of counts in the cell
cor(pca$x[,1], colSums(high), method="spearman")
## [1] 0.1270105
#second factor and the total number of counts in the cell
cor(pca$x[,2], colSums(high), method="spearman")
## [1] -0.3141171

Correlation between PCA and ZINB

cor(pca$x[,1], zinb_high@W[,1], method="spearman")
## [1] -0.2783654
cor(pca$x[,2], zinb_high@W[,1], method="spearman")
## [1] -0.01188811
cor(pca$x[,1], zinb_high@W[,2], method="spearman")
## [1] 0.7553322
cor(pca$x[,2], zinb_high@W[,2], method="spearman")
## [1] -0.4728147

Using scran size factors

sce <- newSCESet(countData=data.frame(high))
sce <- computeSumFactors(sce, sizes=c(10, 20, 30))

sf <- sizeFactors(sce)

norm <- exprs(convertTo(sce, type="monocle"))
fq <- betweenLaneNormalization(high, which="full")

plotRLE(high, outline=FALSE, col=cols[bio], main="Unnormalized counts")

plotRLE(norm, outline=FALSE, col=cols[bio], main="SCRAN normalization")

plotRLE(fq, outline=FALSE, col=cols[bio], main="FQ normalization")

I’m not convinced that this is the best normalization, but since it’s the only one specifically proposed for scRNA-seq, it’s a good place to start.

set.seed(3525)
offsets <- matrix(rep(sf, NROW(high)), ncol=NROW(high), nrow=NCOL(high))
print(system.time(zinb_norm <- zinbFit(high, ncores = 3, K = 2, O_mu = offsets)))
##    user  system elapsed 
## 789.406  95.665 341.431

Plot the results with cells colored according to their biological condition.

plot(zinb_norm@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)

plot(zinb_norm@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)

qc <- as.matrix(colData(fluidigm_high)[,metadata(fluidigm_high)$which_qc])
qcpca <- prcomp(qc, scale=TRUE)

quality <- qcpca$x[,1]
detection_rate <- colSums(high>0)

data.frame(W1=zinb_norm@W[,1], W2=zinb_norm@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=detection_rate)) + scale_colour_gradient(low="blue", high="yellow") + theme_classic()

Compared to PCA (of the log counts)

pca_norm <- prcomp(t(log1p(norm)))
plot(pca_norm$x, col=cols[bio], pch=19)
legend("bottomleft", levels(bio), fill=cols)

plot(pca_norm$x, col=cols2[cl], pch=19)
legend("topleft", levels(cl), fill=cols2)

Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.

#first factor and total number of detected genes in the cell
cor(zinb_norm@W[,1], colSums(high>0),method="spearman")
## [1] -0.2240385
#second factor and the total number of detected genes in the cell
cor(zinb_norm@W[,2], colSums(high>0),method="spearman")
## [1] 0.4842657
#first factor and the total number of counts in the cell
cor(zinb_norm@W[,1], colSums(high),method="spearman")
## [1] -0.1604021
#second factor and the total number of counts in the cell
cor(zinb_norm@W[,2], colSums(high),method="spearman")
## [1] 0.3326923

Same for PCA

#first factor and total number of detected genes in the cell
cor(pca_norm$x[,1], colSums(high>0), method="spearman")
## [1] 0.9566434
#second factor and the total number of detected genes in the cell
cor(pca_norm$x[,2], colSums(high>0), method="spearman")
## [1] -0.1056818
#first factor and the total number of counts in the cell
cor(pca_norm$x[,1], colSums(high), method="spearman")
## [1] -0.07137238
#second factor and the total number of counts in the cell
cor(pca_norm$x[,2], colSums(high), method="spearman")
## [1] 0.4917832

Correlation between PCA and ZINB

cor(pca_norm$x[,1], zinb_norm@W[,1], method="spearman")
## [1] -0.2262675
cor(pca_norm$x[,2], zinb_norm@W[,1], method="spearman")
## [1] 0.004195804
cor(pca_norm$x[,1], zinb_norm@W[,2], method="spearman")
## [1] 0.6608392
cor(pca_norm$x[,2], zinb_norm@W[,2], method="spearman")
## [1] 0.6752622

Using FQ derived offsets

se <- newSeqExpressionSet(high)
se <- betweenLaneNormalization(se, which="full", offset=TRUE)
offsets <- t(offst(se))
set.seed(35235)
print(system.time(zinb_fq <- zinbFit(high, ncores = 3, K = 2, O_mu = offsets)))
##    user  system elapsed 
## 875.791  98.163 370.769

Plot the results with cells colored according to their biological condition.

plot(zinb_fq@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)

plot(zinb_fq@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)

data.frame(W1=zinb_fq@W[,1], W2=zinb_fq@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=detection_rate)) + scale_colour_gradient(low="blue", high="yellow") + theme_classic()

Compared to PCA (of the log counts)

pca_fq <- prcomp(t(log1p(fq)))
plot(pca_fq$x, col=cols[bio], pch=19)
legend("topright", levels(bio), fill=cols)

plot(pca_fq$x, col=cols2[cl], pch=19)
legend("topright", levels(cl), fill=cols2)

Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.

#first factor and total number of detected genes in the cell
cor(zinb_fq@W[,1], colSums(high>0),method="spearman")
## [1] 0.08378497
#second factor and the total number of detected genes in the cell
cor(zinb_fq@W[,2], colSums(high>0),method="spearman")
## [1] 0.3754808
#first factor and the total number of counts in the cell
cor(zinb_fq@W[,1], colSums(high),method="spearman")
## [1] -0.1039336
#second factor and the total number of counts in the cell
cor(zinb_fq@W[,2], colSums(high),method="spearman")
## [1] 0.3515297

Same for PCA

#first factor and total number of detected genes in the cell
cor(pca_fq$x[,1], colSums(high>0), method="spearman")
## [1] 0.7178322
#second factor and the total number of detected genes in the cell
cor(pca_fq$x[,2], colSums(high>0), method="spearman")
## [1] 0.6338724
#first factor and the total number of counts in the cell
cor(pca_fq$x[,1], colSums(high), method="spearman")
## [1] 0.2144231
#second factor and the total number of counts in the cell
cor(pca_fq$x[,2], colSums(high), method="spearman")
## [1] -0.2588287

Correlation between PCA and ZINB

cor(pca_fq$x[,1], zinb_fq@W[,1], method="spearman")
## [1] -0.002097902
cor(pca_fq$x[,2], zinb_fq@W[,1], method="spearman")
## [1] 0.1466783
cor(pca_fq$x[,1], zinb_fq@W[,2], method="spearman")
## [1] 0.7437937
cor(pca_fq$x[,2], zinb_fq@W[,2], method="spearman")
## [1] -0.1988199
## write matrices to file to feed to ZIFA in python
write.csv(log1p(high), file="logcounts_high.csv")
write.csv(log1p(norm), file="logcounts_high_scran.csv")

Low coverage data

Select low coverage cells, filter out the genes that do not have at least 10 counts in at least 10 cells.

fluidigm_low <- fluidigm[,which(colData(fluidigm)$Coverage_Type=="Low")]
filter <- apply(assay(fluidigm_low)>10, 1, sum)>=10

Number of retained genes:

print(sum(filter))
## [1] 2506

No normalization

Fit a zinb model on the low coverage data (no normalization).

fluidigm_low <- fluidigm_low[filter,]
low <- assay(fluidigm_low)
set.seed(654)
print(system.time(zinb_low <- zinbFit(low, ncores = 3, K = 2)))
##    user  system elapsed 
## 403.672  67.831 177.612

Plot the results with cells colored according to their biological condition.

bio <- as.factor(colData(fluidigm_low)$Biological_Condition)
cl <- as.factor(colData(fluidigm_low)$Cluster2)

plot(zinb_low@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)

plot(zinb_low@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)

Compared to PCA (of the log counts)

pca <- prcomp(t(log1p(low)))
plot(pca$x, col=cols[bio], pch=19)
legend("topleft", levels(bio), fill=cols)

plot(pca$x, col=cols2[cl], pch=19)
legend("topleft", levels(cl), fill=cols2)

Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.

#first factor and total number of detected genes in the cell
cor(zinb_low@W[,1], colSums(low>0),method="spearman")
## [1] 0.4277894
#second factor and the total number of detected genes in the cell
cor(zinb_low@W[,2], colSums(low>0),method="spearman")
## [1] -0.5360534
#first factor and the total number of counts in the cell
cor(zinb_low@W[,1], colSums(low),method="spearman")
## [1] 0.0005681818
#second factor and the total number of counts in the cell
cor(zinb_low@W[,2], colSums(low),method="spearman")
## [1] 0.2904283

Same for PCA

#first factor and total number of detected genes in the cell
cor(pca$x[,1], colSums(low>0), method="spearman")
## [1] -0.6268781
#second factor and the total number of detected genes in the cell
cor(pca$x[,2], colSums(low>0), method="spearman")
## [1] -0.03942437
#first factor and the total number of counts in the cell
cor(pca$x[,1], colSums(low), method="spearman")
## [1] 0.06979895
#second factor and the total number of counts in the cell
cor(pca$x[,2], colSums(low), method="spearman")
## [1] -0.474257

Correlation between PCA and ZINB

cor(pca$x[,1], zinb_low@W[,1], method="spearman")
## [1] -0.8020979
cor(pca$x[,2], zinb_low@W[,1], method="spearman")
## [1] -0.06770105
cor(pca$x[,1], zinb_low@W[,2], method="spearman")
## [1] 0.8125874
cor(pca$x[,2], zinb_low@W[,2], method="spearman")
## [1] -0.4075612

Using scran size factors

sce <- newSCESet(countData=data.frame(low))
sce <- computeSumFactors(sce, sizes=c(10, 20, 30))

sf <- sizeFactors(sce)

norm <- exprs(convertTo(sce, type="monocle"))

plotRLE(low, outline=FALSE, col=cols[bio], main="Unnormalized counts")

plotRLE(norm, outline=FALSE, col=cols[bio], main="SCRAN normalization")

set.seed(3525)
offsets <- matrix(rep(sf, NROW(low)), ncol=NROW(low), nrow=NCOL(low))
print(system.time(zinb_norm <- zinbFit(low, ncores = 3, K = 2, O_mu = offsets)))
##    user  system elapsed 
## 279.736  55.275 126.172

Plot the results with cells colored according to their biological condition.

plot(zinb_norm@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)

plot(zinb_norm@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)

qc <- as.matrix(colData(fluidigm_low)[,metadata(fluidigm_low)$which_qc])
qcpca <- prcomp(qc, scale=TRUE)

quality <- qcpca$x[,1]
detection_rate <- colSums(low>0)

data.frame(W1=zinb_norm@W[,1], W2=zinb_norm@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=detection_rate)) + scale_colour_gradient(low="blue", high="yellow") + theme_classic()

Compared to PCA (of the log counts)

pca_norm <- prcomp(t(log1p(norm)))
plot(pca_norm$x, col=cols[bio], pch=19)
legend("bottomleft", levels(bio), fill=cols)

plot(pca_norm$x, col=cols2[cl], pch=19)
legend("topleft", levels(cl), fill=cols2)

Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.

#first factor and total number of detected genes in the cell
cor(zinb_norm@W[,1], colSums(low>0),method="spearman")
## [1] 0.02917491
#second factor and the total number of detected genes in the cell
cor(zinb_norm@W[,2], colSums(low>0),method="spearman")
## [1] -0.5416043
#first factor and the total number of counts in the cell
cor(zinb_norm@W[,1], colSums(low),method="spearman")
## [1] -0.03199301
#second factor and the total number of counts in the cell
cor(zinb_norm@W[,2], colSums(low),method="spearman")
## [1] 0.1593969

Same for PCA

#first factor and total number of detected genes in the cell
cor(pca_norm$x[,1], colSums(low>0), method="spearman")
## [1] -0.8312772
#second factor and the total number of detected genes in the cell
cor(pca_norm$x[,2], colSums(low>0), method="spearman")
## [1] 0.5694024
#first factor and the total number of counts in the cell
cor(pca_norm$x[,1], colSums(low), method="spearman")
## [1] 0.4491259
#second factor and the total number of counts in the cell
cor(pca_norm$x[,2], colSums(low), method="spearman")
## [1] -0.5332168

Correlation between PCA and ZINB

cor(pca_norm$x[,1], zinb_norm@W[,1], method="spearman")
## [1] 0.2093969
cor(pca_norm$x[,2], zinb_norm@W[,1], method="spearman")
## [1] 0.4050699
cor(pca_norm$x[,1], zinb_norm@W[,2], method="spearman")
## [1] 0.859965
cor(pca_norm$x[,2], zinb_norm@W[,2], method="spearman")
## [1] 0.1836538

Include detection rate as covariates

x <- model.matrix(~scale(detection_rate))
set.seed(9948)
print(system.time(zinb_norm <- zinbFit(low, ncores = 3, K = 2, X = x, which_X_mu=1:2, which_X_pi=1L)))
##    user  system elapsed 
## 323.065  52.122 140.469
plot(zinb_norm@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)

plot(zinb_norm@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)

data.frame(W1=zinb_norm@W[,1], W2=zinb_norm@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=detection_rate)) + scale_colour_gradient(low="blue", high="yellow") + theme_classic()

## write matrices to file to feed to ZIFA in python
write.csv(log1p(low), file="logcounts_low.csv")
write.csv(log1p(norm), file="logcounts_low_scran.csv")
write.csv(log1p(low[1:5, 1:3]), file="logcounts_toy.csv")

ZIFA (low coverage, unnormalized data)

zifa_res <- read.csv("zifa_low.csv", header=FALSE)

plot(zifa_res, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)

Adding \(V_\pi\)

library(biomaRt)
mart <- useMart("ensembl")
mart <- useDataset("hsapiens_gene_ensembl", mart = mart)
attrs <- c("hgnc_symbol", "entrezgene")
bm <- getBM(attributes=attrs, mart = mart)
bm <- bm[match(rownames(low), bm[,1]),]
bm <- na.omit(bm)
low <- low[bm[,1],]

gene_info <- getGeneLengthAndGCContent(as.character(bm[,2]), "hsa", mode="org.db")
rownames(gene_info) <- bm[,1]

gene_info <- na.omit(gene_info)
low <- low[rownames(gene_info),]

NB: currently, there is no way to have no columns of X or V in either \(\mu\) or \(\pi\). We should let the user specify NULL?

V <- cbind(rep(0, NROW(low)), gene_info)
print(system.time(zinb_gc <- zinbFit(low, ncores = 3, K = 2, V = V, which_V_mu=1L, which_V_pi=2:3)))
##    user  system elapsed 
## 189.987  40.693  88.167
plot(zinb_gc@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("bottomright", levels(bio), fill=cols)

plot(zinb_gc@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("bottomright", levels(cl), fill=cols2)

Removing V intercept

print(system.time(zinb_nov <- zinbFit(low, ncores = 3, K = 2, V=matrix(0, ncol=1, nrow=NROW(low)))))
##    user  system elapsed 
## 246.570  47.265 111.521
plot(zinb_nov@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("bottomright", levels(bio), fill=cols)

plot(zinb_nov@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("bottomright", levels(cl), fill=cols2)

Keeping it for \(\pi\)

V <- cbind(rep(0, NROW(low)), rep(1, NROW(low)))
print(system.time(zinb_v <- zinbFit(low, ncores = 3, K = 2, V=V, which_V_mu=1L, which_V_pi=2L)))
##    user  system elapsed 
## 145.741  21.524  67.725
plot(zinb_v@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("bottomleft", levels(bio), fill=cols)

Keeping it for \(\mu\)

V <- cbind(rep(0, NROW(low)), rep(1, NROW(low)))
print(system.time(zinb_v2 <- zinbFit(low, ncores = 3, K = 2, V=V, which_V_mu=2L, which_V_pi=1L)))
##    user  system elapsed 
## 320.539  40.633 145.828
plot(zinb_v2@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("bottomleft", levels(bio), fill=cols)

V <- cbind(rep(0, NROW(low)), rep(1, NROW(low)))
X <- cbind(rep(0, NCOL(low)), rep(1, NCOL(low)))
print(system.time(zinb_vx <- zinbFit(low, ncores = 3, K = 2, V=V, X=X, which_V_mu=1L, which_V_pi=2L, which_X_mu=2L, which_X_pi=1L)))
##    user  system elapsed 
## 272.078  54.187 121.909
plot(zinb_vx@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("bottomleft", levels(bio), fill=cols)